set.seed(2021)

Load scripts and packages.

library(SingleCellMultiModal)
library(MultiAssayExperiment)
library(UpSetR)
source("../scripts/initialise.R")

Load data using SingleCellMultiModal Bioconductor package.

mae <- scMultiome("pbmc_10x", mode = "*", dry.run = FALSE, format = "MTX")

Perform some exploration of this data.

gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
colors <- gg_color_hue(length(unique(mae$celltype)))
names(colors) <- unique(mae$celltype)

mae
## A MultiAssayExperiment object of 2 listed
##  experiments with user-defined names and respective classes.
##  Containing an ExperimentList class object of length 2:
##  [1] atac: SingleCellExperiment with 108344 rows and 10032 columns
##  [2] rna: SingleCellExperiment with 36549 rows and 10032 columns
## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save all data to files
upsetSamples(mae)

head(colData(mae))
## DataFrame with 6 rows and 6 columns
##                  nCount_RNA nFeature_RNA nCount_ATAC nFeature_ATAC
##                   <integer>    <integer>   <integer>     <integer>
## AAACAGCCAAGGAATC       8380         3308       55582         13878
## AAACAGCCAATCCCTT       3771         1896       20495          7253
## AAACAGCCAATGCGCT       6876         2904       16674          6528
## AAACAGCCAGTAGGTG       7614         3061       39454         11633
## AAACAGCCAGTTTACG       3633         1691       20523          7245
## AAACAGCCATCCAGGT       7782         3028       22412          8602
##                                celltype broad_celltype
##                             <character>    <character>
## AAACAGCCAAGGAATC      naive CD4 T cells       Lymphoid
## AAACAGCCAATCCCTT     memory CD4 T cells       Lymphoid
## AAACAGCCAATGCGCT      naive CD4 T cells       Lymphoid
## AAACAGCCAGTAGGTG      naive CD4 T cells       Lymphoid
## AAACAGCCAGTTTACG     memory CD4 T cells       Lymphoid
## AAACAGCCATCCAGGT non-classical monocy..        Myeloid
dim(experiments(mae)[["rna"]])
## [1] 36549 10032
names(experiments(mae))
## [1] "atac" "rna"
sce.rna <- experiments(mae)[["rna"]]

Normalise and select features for RNA modality. Perform PCA and visualise the RNA modality.

sce.rna <- logNormCounts(sce.rna)

decomp <- modelGeneVar(sce.rna)
hvgs <- rownames(decomp)[decomp$mean>0.01 & decomp$p.value <= 0.05]
sce.rna <- sce.rna[hvgs,]

# PCA
sce.rna <- scater::runPCA(sce.rna, ncomponents = 25)

# UMAP
sce.rna <- scater::runUMAP(sce.rna, dimred="PCA", n_neighbors = 25, min_dist = 0.3)
plotUMAP(sce.rna, colour_by="celltype", point_size=0.5, point_alpha=1)

Normalise and select features for ATAC modality. Perform PCA and visualise the ATAC modality. Note these cells are the same as the RNA modality.

dim(experiments(mae)[["atac"]])
## [1] 108344  10032
sce.atac <- experiments(mae)[["atac"]]

sce.atac <- logNormCounts(sce.atac)

decomp <- modelGeneVar(sce.atac)
hvgs <- rownames(decomp)[decomp$mean>0.25 & decomp$p.value <= 0.05]
sce.atac <- sce.atac[hvgs,]

sce.atac <- scater::runPCA(sce.atac, ncomponents = 25)

# UMAP
sce.atac <- scater::runUMAP(sce.atac, dimred="PCA", n_neighbors = 25, min_dist = 0.3)
plotUMAP(sce.atac, colour_by="celltype", point_size=0.5, point_alpha=1)

Split the cells into three groups with ratios provided below, first for RNA modality, second for multiome modality, and third for ATAC modality. Using this simulation, perform StabMap. Note that other techniques such as PCA, UINMF, or MultiMAP require some further matching of features between the RNA and ATAC modalities to be able to run.

Assess the quality of the mapping visually.

gList = list()

for (probval in c(1:10)) {
  
  print(probval)
  
  modality = sample(c("RNA", "Multiome", "ATAC"), size = ncol(sce.rna),
                    replace = TRUE,
                    prob = c(probval,1,probval))
  names(modality) <- colnames(sce.rna)
  table(modality)
  
  # multiome RNA
  M_R = assay(sce.rna, "logcounts")[,modality == "Multiome"]
  
  # multiome ATAC
  M_A = assay(sce.atac, "logcounts")[,modality == "Multiome"]
  
  # RNA
  R = assay(sce.rna, "logcounts")[,modality == "RNA"]
  
  # ATAC
  A = assay(sce.atac, "logcounts")[,modality == "ATAC"]
  
  # they dont have any overlapping features
  Reduce(intersect, list(c(rownames(M_R), rownames(M_A)), rownames(R), rownames(A)))
  
  # there should be no overlapping colnames between the data,
  # but colnames should be identical between the two multiome sub matrices
  identical(colnames(M_R), colnames(M_A))
  intersect(colnames(M_R), colnames(R))
  intersect(colnames(M_R), colnames(A))
  intersect(colnames(R), colnames(A))
  
  assay_list = list(RNA = R,
                    ATAC = A,
                    Multiome = rbind(M_R, M_A))
  
  out_joint = stabMapGeneralised(assay_list,
                                 reference_list = c("Multiome"),
                                 projectAll = TRUE,
                                 plot = FALSE)
  
  # out_joint_UMAP = calculateUMAP_rnames(out_joint)
  
  out_joint_corrected = reducedMNN_batchFactor(
    as.matrix(out_joint), batchFactor = modality[rownames(out_joint)])
  out_joint_UMAP_corrected = calculateUMAP_rnames(out_joint_corrected)
  
  
  out_joint_df = data.frame(cell = names(modality),
                            modality = modality,
                            # UMAP1 = out_joint_UMAP[names(modality),1],
                            # UMAP2 = out_joint_UMAP[names(modality),2],
                            celltype = colData(sce.rna)[names(modality),"celltype"],
                            UMAP1_corrected = out_joint_UMAP_corrected[names(modality),1],
                            UMAP2_corrected = out_joint_UMAP_corrected[names(modality),2])
  out_joint_df <- out_joint_df[sample(rownames(out_joint_df)),]
  
  g = ggplot(out_joint_df, aes(x = modality)) + 
    geom_bar(aes(fill = modality)) +
    ylab("Number of cells") +
    theme_classic() +
    ggplot(out_joint_df, aes(x = UMAP1_corrected, y = UMAP2_corrected)) + 
    geom_point(aes(colour = modality)) +
    theme_classic() +
    ggplot(out_joint_df, aes(x = UMAP1_corrected, y = UMAP2_corrected)) + 
    geom_point(aes(colour = celltype)) +
    theme_classic() +
    ggplot(out_joint_df, aes(x = UMAP1_corrected, y = UMAP2_corrected)) + 
    geom_point(aes(colour = celltype)) +
    facet_wrap(~modality) +
    theme_classic() +
    plot_layout(design = c("
                         123
                         444
                         ")) +
    NULL
  
  gList[[probval]] <- g
  
  print(g)
}
## [1] 1
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 2
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 3
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 4
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 5
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 6
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 7
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 8
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 9
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

## [1] 10
## [1] "fitting linear models"
## [1] "fitted linear models"
## [1] "fitting linear models"
## [1] "fitted linear models"

Finish

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] igraph_1.2.6                batchelor_1.8.0            
##  [3] patchwork_1.1.1             scran_1.20.0               
##  [5] scater_1.20.0               ggplot2_3.3.3              
##  [7] scuttle_1.2.0               SingleCellExperiment_1.14.1
##  [9] UpSetR_1.4.0                SingleCellMultiModal_1.4.0 
## [11] MultiAssayExperiment_1.18.0 SummarizedExperiment_1.22.0
## [13] Biobase_2.52.0              GenomicRanges_1.44.0       
## [15] GenomeInfoDb_1.28.0         IRanges_2.26.0             
## [17] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [19] MatrixGenerics_1.4.0        matrixStats_0.58.0         
## 
## loaded via a namespace (and not attached):
##   [1] AnnotationHub_3.0.0           BiocFileCache_2.0.0          
##   [3] plyr_1.8.6                    BiocParallel_1.26.0          
##   [5] digest_0.6.27                 htmltools_0.5.1.1            
##   [7] viridis_0.6.1                 magick_2.7.2                 
##   [9] fansi_0.4.2                   magrittr_2.0.1               
##  [11] memoise_2.0.0                 ScaledMatrix_1.0.0           
##  [13] SpatialExperiment_1.2.0       cluster_2.1.2                
##  [15] limma_3.48.0                  Biostrings_2.60.0            
##  [17] R.utils_2.10.1                colorspace_2.0-1             
##  [19] blob_1.2.1                    rappdirs_0.3.3               
##  [21] xfun_0.23                     dplyr_1.0.6                  
##  [23] crayon_1.4.1                  RCurl_1.98-1.3               
##  [25] jsonlite_1.7.2                glue_1.4.2                   
##  [27] gtable_0.3.0                  zlibbioc_1.38.0              
##  [29] XVector_0.32.0                DelayedArray_0.18.0          
##  [31] BiocSingular_1.8.0            DropletUtils_1.12.0          
##  [33] Rhdf5lib_1.14.0               HDF5Array_1.20.0             
##  [35] scales_1.1.1                  DBI_1.1.1                    
##  [37] edgeR_3.34.0                  Rcpp_1.0.6                   
##  [39] viridisLite_0.4.0             xtable_1.8-4                 
##  [41] dqrng_0.3.0                   bit_4.0.4                    
##  [43] rsvd_1.0.5                    ResidualMatrix_1.2.0         
##  [45] metapod_1.0.0                 httr_1.4.2                   
##  [47] ellipsis_0.3.2                pkgconfig_2.0.3              
##  [49] R.methodsS3_1.8.1             farver_2.1.0                 
##  [51] sass_0.4.0                    uwot_0.1.10                  
##  [53] dbplyr_2.1.1                  locfit_1.5-9.4               
##  [55] utf8_1.2.1                    tidyselect_1.1.1             
##  [57] labeling_0.4.2                rlang_0.4.11                 
##  [59] later_1.2.0                   AnnotationDbi_1.54.0         
##  [61] munsell_0.5.0                 BiocVersion_3.13.1           
##  [63] tools_4.1.0                   cachem_1.0.5                 
##  [65] generics_0.1.0                RSQLite_2.2.7                
##  [67] ExperimentHub_2.0.0           evaluate_0.14                
##  [69] stringr_1.4.0                 fastmap_1.1.0                
##  [71] yaml_2.2.1                    knitr_1.33                   
##  [73] bit64_4.0.5                   purrr_0.3.4                  
##  [75] KEGGREST_1.32.0               sparseMatrixStats_1.4.0      
##  [77] mime_0.10                     R.oo_1.24.0                  
##  [79] compiler_4.1.0                beeswarm_0.3.1               
##  [81] filelock_1.0.2                curl_4.3.1                   
##  [83] png_0.1-7                     interactiveDisplayBase_1.30.0
##  [85] tibble_3.1.2                  statmod_1.4.36               
##  [87] bslib_0.2.5.1                 stringi_1.6.2                
##  [89] highr_0.9                     RSpectra_0.16-0              
##  [91] lattice_0.20-44               bluster_1.2.0                
##  [93] Matrix_1.3-3                  vctrs_0.3.8                  
##  [95] pillar_1.6.1                  lifecycle_1.0.0              
##  [97] rhdf5filters_1.4.0            BiocManager_1.30.15          
##  [99] jquerylib_0.1.4               RcppAnnoy_0.0.18             
## [101] BiocNeighbors_1.10.0          cowplot_1.1.1                
## [103] bitops_1.0-7                  irlba_2.3.3                  
## [105] httpuv_1.6.1                  R6_2.5.0                     
## [107] promises_1.2.0.1              gridExtra_2.3                
## [109] vipor_0.4.5                   codetools_0.2-18             
## [111] assertthat_0.2.1              rhdf5_2.36.0                 
## [113] rjson_0.2.20                  withr_2.4.2                  
## [115] GenomeInfoDbData_1.2.6        grid_4.1.0                   
## [117] beachmat_2.8.0                rmarkdown_2.8                
## [119] DelayedMatrixStats_1.14.0     shiny_1.6.0                  
## [121] ggbeeswarm_0.6.0